# Clear memory
rm(list = ls())

# Load packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  haven, tigris, tidyverse, data.table, readxl, tidylog, sf
)
options("tidylog.display" = NULL)
select <- dplyr::select

# County fips list
counties <- counties(cb = TRUE, year = 2020) %>%
  st_as_sf() %>%
  as_tibble() %>%
  dplyr::select(county = GEOID) %>% 
  mutate(county = ifelse(county == "12025", "12086", county))

# 1978-1991 non-attainment
pollutants_1978 <- read_xlsx("raw/greenbook/PHISTORY_1978.xlsx") %>% 
  distinct(pollutant) %>% 
  pull(pollutant)
nonattainment_1978 <- read_xlsx("raw/greenbook/PHISTORY_1978.xlsx") %>% 
  select(pollutant:pw_1991, -fivedigit) %>% 
  mutate(county = paste0(fips_state, fips_cnty)) %>%
  mutate(pollutant = case_when(
    pollutant == pollutants_1978[12] ~ "so2_71",
    pollutant == pollutants_1978[9] ~ "pm25_97",
    pollutant == pollutants_1978[1] ~ "o3_79",
    pollutant == pollutants_1978[2] ~ "o3_97",
    pollutant == pollutants_1978[5] ~ "pb_78",
    pollutant == pollutants_1978[10] ~ "pm25_06",
    pollutant == pollutants_1978[6] ~ "pb_08",
    pollutant == pollutants_1978[4] ~ "co_71",
    pollutant == pollutants_1978[8] ~ "pm10_87",
    pollutant == pollutants_1978[13] ~ "so2_10",
    pollutant == pollutants_1978[3] ~ "o3_08",
    pollutant == pollutants_1978[11] ~ "pm25_12",
    pollutant == pollutants_1978[7] ~ "no2_71",
  )) %>%
  select(county, pollutant, starts_with("pw_")) %>%
  distinct() %>% 
  pivot_longer(
    cols = starts_with("pw_"),
    names_to = "year",
    names_prefix = "pw_",
    values_to = "nonattainment"
  ) %>%
  distinct(county, pollutant, year, .keep_all = TRUE) %>%
  pivot_wider(
    names_from = "pollutant",
    names_prefix = "nonattainment_",
    values_from = "nonattainment"
  ) %>%
  mutate(across(contains("nonattainment"), ~ !is.na(.x))) %>%
  mutate(year = as.numeric(year)) %>% 
  mutate(nonattainment_o3_15 = NA)

# 1992-2020 non-attainment
pollutants_1992 <- read_xls("raw/greenbook/PHISTORY_1992.xls") %>% 
  distinct(pollutant) %>% 
  pull(pollutant)
nonattainment_1992 <- read_xls("raw/greenbook/PHISTORY_1992.xls") %>%
  mutate(county = paste0(fips_state, fips_cnty)) %>%
  mutate(pollutant = case_when(
    pollutant == pollutants_1992[4] ~ "so2_71",
    pollutant == pollutants_1992[5] ~ "pm25_97",
    pollutant == pollutants_1992[6] ~ "o3_79",
    pollutant == pollutants_1992[7] ~ "o3_97",
    pollutant == pollutants_1992[8] ~ "pb_78",
    pollutant == pollutants_1992[3] ~ "pm25_06",
    pollutant == pollutants_1992[9] ~ "pb_08",
    pollutant == pollutants_1992[1] ~ "co_71",
    pollutant == pollutants_1992[2] ~ "pm10_87",
    pollutant == pollutants_1992[11] ~ "o3_15",
    pollutant == pollutants_1992[12] ~ "so2_10",
    pollutant == pollutants_1992[10] ~ "o3_08",
    pollutant == pollutants_1992[13] ~ "pm25_12",
    pollutant == pollutants_1992[14] ~ "no2_71",
  )) %>%
  select(county, pollutant, starts_with("pw_")) %>%
  distinct() %>% 
  pivot_longer(
    cols = starts_with("pw_"),
    names_to = "year",
    names_prefix = "pw_",
    values_to = "nonattainment"
  ) %>%
  distinct(county, pollutant, year, .keep_all = TRUE) %>%
  pivot_wider(
    names_from = "pollutant",
    names_prefix = "nonattainment_",
    values_from = "nonattainment"
  ) %>%
  mutate(across(contains("nonattainment"), ~ !is.na(.x))) %>%
  mutate(year = as.numeric(year))

# bind together the two separate histories
nonattainment <- bind_rows(nonattainment_1978, nonattainment_1992) %>% 
  arrange(county, year)

# Generate full panel
full_counties <- list(
  county = counties$county,
  year = min(nonattainment$year, na.rm = T):max(nonattainment$year, na.rm = T)
) %>%
  cross_df() %>%
  left_join(nonattainment) %>%
  replace(., is.na(.), FALSE) %>%
  mutate(across(contains("nonattainment"), ~ as.numeric(.x)))

# Apply the corresponding standard for each year
full_counties <- full_counties %>%
  mutate(
    nonattainment_so2 = nonattainment_so2_71 + nonattainment_so2_10 > 0,
    nonattainment_pm25 = nonattainment_pm25_97 + nonattainment_pm25_06 + nonattainment_pm25_12 > 0,
    nonattainment_o3 = 
      nonattainment_o3_79 + nonattainment_o3_97 + nonattainment_o3_08 + nonattainment_o3_15 > 0,
    nonattainment_co = nonattainment_co_71 > 0,
    nonattainment_no2 = nonattainment_no2_71 > 0,
    nonattainment_pm10 = nonattainment_pm10_87 > 0
  ) %>%
  mutate(across(starts_with("nonattainment_"), ~ as.numeric(.x))) %>% 
  select(
    fips = county, year,
    nonattainment_so2, nonattainment_o3, nonattainment_pm10,
    nonattainment_no2, nonattainment_co
  ) %>%
  rowwise() %>%
  mutate(nonattainment = max(
    c(
      nonattainment_so2, nonattainment_o3, nonattainment_pm10,
      nonattainment_no2, nonattainment_co
    )
  )) %>% 
  ungroup()  

full_counties <- full_counties %>% 
  arrange(fips, year) %>% 
  mutate(nonattainment_o3 = ifelse(nonattainment_o3 == 1 | nonattainment_no2 == 1, 1, 0)) %>% 
  select(-nonattainment_no2) 

extra_years <- crossing(
  fips = unique(full_counties$fips),
  year = 1950:(min(full_counties$year)-1)
)

full_counties <- extra_years %>% 
  bind_rows(full_counties) 

full_counties[is.na(full_counties)] = 0

full_counties = full_counties %>%
  group_by(fips, year) %>% 
  mutate(sum_nonattainment = max(
    nonattainment_o3, nonattainment_co, nonattainment_pm10, nonattainment_so2
  )
  ) %>% 
  ungroup()

fwrite(full_counties, "data/greenbook/nonattainment_status.csv")
